#include <cstdio>
#include <iostream>
#include <time.h>
#include <mpi.h>
#define SIZE 4

/*
 * Computes rows [row_begin, row_end) of the product c = a * b.
 *
 * All matrices are stored row-major in flat buffers:
 *   a has n columns, b is n x k, c has k columns.
 * Row indices are relative to the start of `a` and `c`, so a caller holding
 * only a slice of the matrix passes the slice pointers with row_begin = 0.
 * An empty range (row_begin >= row_end) does nothing.
 */
static void multiply_rows(const double *a, const double *b, double *c,
                          int row_begin, int row_end, int n, int k)
{
    for (int i = row_begin; i < row_end; i++) {
        for (int j = 0; j < k; j++) {
            double sum = 0;
            for (int l = 0; l < n; l++) {
                sum += a[i * n + l] * b[l * k + j];
            }
            c[i * k + j] = sum;
        }
    }
}

/*
 * Row-partitioned matrix multiplication C = A * B across MPI processes.
 *
 * Rank 0 builds A (m x n) and B (n x k), sends the whole of B plus a band of
 * `row_range` rows of A to every other rank, computes its own band, collects
 * the result bands, prints C when SIZE <= 8 and finally prints the elapsed
 * wall time measured between the two barriers.
 *
 * Every other rank receives B and its band of A, multiplies, and sends the
 * resulting band of C back to rank 0.
 *
 * m / comm_sz is truncated, so up to comm_sz - 1 trailing rows belong to no
 * band. Rank 0 computes those rows itself; this also covers comm_sz > m,
 * where row_range is 0 and rank 0 ends up computing the whole product.
 */
int main(void)
{
    const int m = SIZE, n = SIZE, k = SIZE;
    int comm_sz;   // number of processes
    int my_rank;   // rank of this process
    MPI_Init(NULL, NULL);
    MPI_Comm_size(MPI_COMM_WORLD, &comm_sz);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

    const int row_range = m / comm_sz;   // rows per band (may be 0)
    const int A_size = row_range * n;    // doubles in one band of A
    const int C_size = row_range * k;    // doubles in one band of C

    // Flat row-major storage; every rank needs the whole of B.
    double *B = new double[n * k];

    double start, end;
    MPI_Barrier(MPI_COMM_WORLD);
    start = MPI_Wtime();

    if (my_rank == 0) {
        // Primary process: owns the full A and C.
        for (int i = 0; i < n; i++) {
            for (int j = 0; j < k; j++) {
                B[i * k + j] = i * n + j + 1;
            }
        }

        double *A = new double[m * n];
        double *C = new double[m * k];
        for (int i = 0; i < m; i++) {
            for (int j = 0; j < n; j++) {
                A[i * n + j] = i * m + j + 1;
            }
        }

        // Send B to every worker (tag = destination rank).
        for (int dest = 1; dest < comm_sz; dest++) {
            MPI_Send(B, n * k, MPI_DOUBLE, dest, dest, MPI_COMM_WORLD);
        }

        // Send each worker its band of A (tag = rank + comm_sz).
        for (int dest = 1; dest < comm_sz; dest++) {
            MPI_Send(A + dest * A_size, A_size, MPI_DOUBLE, dest,
                     dest + comm_sz, MPI_COMM_WORLD);
        }

        // Band 0 is computed locally while the workers process theirs.
        multiply_rows(A, B, C, 0, row_range, n, k);

        // Rows left over by the truncating division belong to no band;
        // without this they would be printed uninitialised.
        multiply_rows(A, B, C, comm_sz * row_range, m, n, k);

        // Collect the result bands (tag = rank + 2 * comm_sz).
        for (int src = 1; src < comm_sz; src++) {
            MPI_Recv(C + src * C_size, C_size, MPI_DOUBLE, src,
                     src + 2 * comm_sz, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        }

        // Only small matrices are printed.
        if (SIZE <= 8) {
            for (int i = 0; i < m; i++) {
                for (int j = 0; j < k; j++) {
                    printf("%lf ", C[i * k + j]);
                }
                putchar('\n');
            }
        }

        delete [] A;
        delete [] C;
    } else {
        // Worker process: holds only its own band of A and C.
        // Flat buffers keep this valid even when row_range is 0.
        double *buf_A = new double[A_size];
        double *buf_C = new double[C_size];

        MPI_Recv(B, n * k, MPI_DOUBLE, 0, my_rank,
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        MPI_Recv(buf_A, A_size, MPI_DOUBLE, 0, my_rank + comm_sz,
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);

        multiply_rows(buf_A, B, buf_C, 0, row_range, n, k);

        MPI_Send(buf_C, C_size, MPI_DOUBLE, 0, my_rank + 2 * comm_sz,
                 MPI_COMM_WORLD);

        delete [] buf_C;
        delete [] buf_A;
    }

    MPI_Barrier(MPI_COMM_WORLD);
    end = MPI_Wtime();
    if (my_rank == 0) {
        printf("%lf s", (double)(end - start));
    }
    MPI_Finalize();   // MPI is no longer needed; release its resources
    delete [] B;

    return 0;
}